In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
In [2]:
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
        -Descripción: Función que recibe un dataset y devuelve una lista con los nombres de las 
        variables categóricas
        -Inputs: 
            -- dataset: Pandas dataframe que contiene los datos
        -Return:
            -- lista_variables_categoricas: lista con los nombres de las variables categóricas del
            dataset de entrada con menos de 100 valores diferentes
            -- 1: la ejecución es incorrecta
    '''
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []
    for i in dataset.columns:
        if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)

    return lista_variables_categoricas, other

Cargo el archivo¶

In [3]:
df_fraud = pd.read_csv("../data/Base.csv")
df_fraud.head()
Out[3]:
fraud_bool income name_email_similarity prev_address_months_count current_address_months_count customer_age days_since_request intended_balcon_amount payment_type zip_count_4w velocity_6h velocity_24h velocity_4w bank_branch_count_8w date_of_birth_distinct_emails_4w employment_status credit_risk_score email_is_free housing_status phone_home_valid phone_mobile_valid bank_months_count has_other_cards proposed_credit_limit foreign_request source session_length_in_minutes device_os keep_alive_session device_distinct_emails_8w device_fraud_count month
0 1 0.9 0.166828 -1 88 50 0.020925 -1.331345 AA 769 10650.765523 3134.319630 3863.647740 1 6 CA 185 0 BA 1 0 24 0 500.0 0 INTERNET 3.888115 windows 0 1 0 7
1 1 0.9 0.296286 -1 144 50 0.005418 -0.816224 AB 366 534.047319 2670.918292 3124.298166 718 3 CA 259 1 BA 0 0 15 0 1500.0 0 INTERNET 31.798819 windows 0 1 0 7
2 1 0.9 0.044985 -1 132 40 3.108549 -0.755728 AC 870 4048.534263 2893.621498 3159.590679 1 14 CB 177 1 BA 0 1 -1 0 200.0 0 INTERNET 4.728705 other 0 1 0 7
3 1 0.9 0.159511 -1 22 50 0.019079 -1.205124 AB 810 3457.064063 4054.908412 3022.261812 1921 6 CA 110 1 BA 0 1 31 1 200.0 0 INTERNET 2.047904 linux 0 1 0 7
4 1 0.9 0.596414 -1 218 50 0.004441 -0.773276 AB 890 5020.341679 2728.237159 3087.670952 1990 2 CA 295 1 BA 1 0 31 0 1500.0 0 INTERNET 3.775225 macintosh 1 1 0 7

Objetivos del trabajo

Queremos hacer un análisis de la posibilidad de fraude de los clientes que se han abierto una cuenta en el banco en los últimos ocho meses. Este es uno de los grandes análisis que deben hacer las entidades financieras y es lo que debemos resolver.

Separación test y train:¶

In [4]:
from sklearn.model_selection import train_test_split

X = df_fraud.drop(columns=['fraud_bool'], inplace=False)

y = df_fraud['fraud_bool']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=69)

from sklearn.model_selection import train_test_split

df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=0.2, random_state=69)

Analisis generales de la tabla¶

Dimensión¶

In [5]:
dimension=df_fraud.shape, df_fraud.drop_duplicates().shape
dimension
Out[5]:
((1000000, 32), (1000000, 32))
In [6]:
dimension_train=X_train.shape, X_train.drop_duplicates().shape
dimension_train
Out[6]:
((800000, 31), (800000, 31))
In [7]:
X_train.dtypes
Out[7]:
income                              float64
name_email_similarity               float64
prev_address_months_count             int64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                       int64
proposed_credit_limit               float64
foreign_request                       int64
source                               object
session_length_in_minutes           float64
device_os                            object
keep_alive_session                    int64
device_distinct_emails_8w             int64
device_fraud_count                    int64
month                                 int64
dtype: object
In [8]:
df_fraud_fraud_bool = df_fraud['fraud_bool']\
        .value_counts(normalize=True)\
        .mul(100).rename('percent').reset_index()

df_fraud_fraud_bool_conteo = df_fraud['fraud_bool'].value_counts().reset_index()
df_fraud_fraud_bool_pc = pd.merge(df_fraud_fraud_bool, df_fraud_fraud_bool_conteo, on=['index'], how='inner')
df_fraud_fraud_bool_pc
Out[8]:
index percent fraud_bool
0 0 98.8971 988971
1 1 1.1029 11029
In [9]:
fig = px.histogram(df_fraud_fraud_bool_pc, x="index", y=['percent'])
fig.show()

Gracias al gráfico y a la tabla anterior, observamos que el porcentaje de fraude dentro de nuestros datos es del 1,1%.

Ponemos los números nulos representados con "-1" o valores negativos a "NA".¶

In [10]:
columns_with_special_missing_values = {
    'prev_address_months_count': -1,
    'current_address_months_count': -1,
    'intended_balcon_amount': 0,  # Ahora usamos 0 para representar valores menores a 0
    'bank_months_count': -1,
    'session_length_in_minutes': -1,
    'device_distinct_emails_8w': -1
}

# Sustituir valores nulos (representados con -1 o con valores negativos)
for col, special_value in columns_with_special_missing_values.items():
    if special_value is not None:
        if col == 'intended_balcon_amount':
            df_fraud[col] = df_fraud[col].apply(lambda x: x if x >= special_value else pd.NA)
        else:
            df_fraud[col] = df_fraud[col].apply(lambda x: x if x != special_value else pd.NA)

Selección de threshold por filas y columnas¶

In [11]:
pd_series_null_columns = df_fraud.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = df_fraud.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)

pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])     
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])  
pd_null_filas['target'] = df_fraud['fraud_bool'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/df_fraud.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/df_fraud.shape[1]
(32,) (1000000,)
In [12]:
df_fraud.shape
Out[12]:
(1000000, 32)
In [13]:
pd_null_columnas
Out[13]:
nulos_columnas porcentaje_columnas
intended_balcon_amount 742523 0.742523
prev_address_months_count 712920 0.712920
bank_months_count 253635 0.253635
current_address_months_count 4254 0.004254
session_length_in_minutes 2015 0.002015
device_distinct_emails_8w 359 0.000359
fraud_bool 0 0.000000
foreign_request 0 0.000000
phone_mobile_valid 0 0.000000
has_other_cards 0 0.000000
proposed_credit_limit 0 0.000000
device_os 0 0.000000
source 0 0.000000
housing_status 0 0.000000
keep_alive_session 0 0.000000
device_fraud_count 0 0.000000
phone_home_valid 0 0.000000
credit_risk_score 0 0.000000
email_is_free 0 0.000000
income 0 0.000000
employment_status 0 0.000000
date_of_birth_distinct_emails_4w 0 0.000000
bank_branch_count_8w 0 0.000000
velocity_4w 0 0.000000
velocity_24h 0 0.000000
velocity_6h 0 0.000000
zip_count_4w 0 0.000000
payment_type 0 0.000000
days_since_request 0 0.000000
customer_age 0 0.000000
name_email_similarity 0 0.000000
month 0 0.000000
In [14]:
threshold=0.9
list_vars_not_null = list(pd_null_columnas[pd_null_columnas['porcentaje_columnas']<threshold].index)
df_fraud_filter_null = df_fraud.loc[:, list_vars_not_null]
df_fraud_filter_null.shape
Out[14]:
(1000000, 32)
In [15]:
pd_null_filas
Out[15]:
nulos_filas target porcentaje_filas
415911 5 0 0.15625
539043 5 0 0.15625
315076 5 0 0.15625
980028 5 0 0.15625
281065 5 0 0.15625
... ... ... ...
285543 0 0 0.00000
285550 0 0 0.00000
285551 0 0 0.00000
878673 0 0 0.00000
500000 0 0 0.00000

1000000 rows × 3 columns

Tipos: Variables categoricas y numericas¶

In [16]:
list_cat_vars, other = dame_variables_categoricas(dataset=df_fraud_filter_null)
df_fraud_filter_null[list_cat_vars] = df_fraud_filter_null[list_cat_vars].astype("category")
df_fraud_filter_null[list_cat_vars].head()
Out[16]:
bank_months_count device_distinct_emails_8w fraud_bool foreign_request phone_mobile_valid has_other_cards device_os source housing_status keep_alive_session device_fraud_count phone_home_valid email_is_free employment_status date_of_birth_distinct_emails_4w payment_type customer_age month
0 24 1 1 0 0 0 windows INTERNET BA 0 0 1 0 CA 6 AA 50 7
1 15 1 1 0 0 0 windows INTERNET BA 0 0 0 1 CA 3 AB 50 7
2 NaN 1 1 0 1 0 other INTERNET BA 0 0 0 1 CB 14 AC 40 7
3 31 1 1 0 1 1 linux INTERNET BA 0 0 0 1 CA 6 AB 50 7
4 31 1 1 0 0 0 macintosh INTERNET BA 1 0 1 1 CA 2 AB 50 7
In [17]:
list_cat_vars
Out[17]:
['bank_months_count',
 'device_distinct_emails_8w',
 'fraud_bool',
 'foreign_request',
 'phone_mobile_valid',
 'has_other_cards',
 'device_os',
 'source',
 'housing_status',
 'keep_alive_session',
 'device_fraud_count',
 'phone_home_valid',
 'email_is_free',
 'employment_status',
 'date_of_birth_distinct_emails_4w',
 'payment_type',
 'customer_age',
 'month']
In [18]:
df_fraud_filter_null['employment_status'].value_counts()
Out[18]:
CA    730252
CB    138288
CF     44034
CC     37758
CD     26522
CE     22693
CG       453
Name: employment_status, dtype: int64

Tal y como podemos observar, la mayoría de nulos se encuentran principalmente en los "employment status" CA y CB.

Guardamos el CSV con los cambios realizados:¶

In [19]:
df_fraud.to_csv("../data/df_fraud_na.csv")